# Dickstein, Ho, and Mark (2023)
# The function in this script modifies the household-insurance option dataset into a format that can be used in the estimation
# and counterfactual processes.

# Read the ACG positive-quintile file as comma-separated character tokens and
# keep tokens 2 through 7, converted to numeric. The intermediate token vector
# is confined to `local()` so only `acgquinsvec` is created at top level.
# NOTE(review): presumably this vector is what callers pass as `acgquins` to
# mod_exploded_data() -- confirm against the calling scripts.
acgquinsvec <- local({
  quintile_tokens <- scan(
    file = "~/sharedWork/oregon/Analysis_QL/orsg_rep/analysis/build/estimation/ml_indiv/data/acg_positive_quintiles",
    what = "character",
    sep = ","
  )
  as.numeric(quintile_tokens[2:7])
})

#' Prepare the household-insurance option dataset for estimation
#'
#' Restricts `data` to the requested rating areas and years, derives risk,
#' demographic, income, plan and cost covariates, winsorizes cost from above,
#' and appends dummy columns for ACG bins, payer-year-portland cells, years
#' and payers. Progress and row counts are printed along the way.
#'
#' Requires dplyr (attached, for `%>%` and the verbs) and fastDummies.
#'
#' @param data Data frame with (at least) the columns referenced below:
#'   subscriberid, best_guess_ra, year, sum_concurrent_risk,
#'   max_concurrent_risk, ndeps, nspouse, withkids, married, nummonths_span,
#'   best_guess_netprem, totpaidmonth_span, age, best_guess_incomeoverFPL,
#'   best_guess_Subsidy, MNC, best_guess_AV, constructed_plan_year, PAYER_ID.
#' @param output_path Unused by this function; retained so existing callers
#'   that pass it keep working.
#' @param highmeanthresh Cutoff on mean concurrent risk for `high_risk`.
#' @param lowsumthresh Cutoff on summed concurrent risk for `low_sum_risk`.
#' @param highsumthresh Cutoff on summed concurrent risk for `high_sum_risk`.
#' @param highmaxthresh Cutoff on max concurrent risk for `max_acg_high`.
#' @param medincome Cutoff on `best_guess_incomeoverFPL` separating
#'   `top_income` (>=) from `low_income` (<).
#' @param acgquins Numeric break points (above 0) used to bin
#'   `sum_concurrent_risk`; 0 is prepended as the lowest break. Required.
#' @param winsorbound Upper bound applied to `cost` (cost is in 100's of
#'   dollars at that point).
#' @param ravec Rating areas (`best_guess_ra`) to keep.
#' @param yrvec Years to keep.
#' @return `data` with the rows filtered and the derived and dummy columns
#'   added.
mod_exploded_data <- function(data, output_path,
                              highmeanthresh = 0.59616,
                              lowsumthresh = 0.2575,
                              highsumthresh = 1.76329,
                              highmaxthresh = 3.37157,
                              medincome = 3,
                              acgquins,
                              winsorbound = 89.3707340916662,
                              ravec = 1:7,
                              yrvec = 2015:2016) {

  # Fail fast: without this the missing argument only surfaces after all of
  # the filtering and covariate construction has already run.
  if (missing(acgquins)) {
    stop("`acgquins` must be supplied (break points for the ACG bins).",
         call. = FALSE)
  }

  print(paste0("Starting n observations: ", nrow(data)))
  print(paste0("Starting n subscriberids: ", length(unique(data$subscriberid))))
  data <- data %>% filter(best_guess_ra %in% ravec & year %in% yrvec)
  print(paste0("After rating area restriction implemented (n observations): ", nrow(data)))
  print(paste0("After rating are and year subscriberid (n subscriberids): ", length(unique(data$subscriberid))))

  # The year dummies generated below are followed by `select(-year_2014)`.
  # When 2014 is excluded from the sample no such dummy would be created, so
  # add a placeholder column for that select to drop.
  if (!(2014 %in% yrvec)) {
    data$year_2014 <- 0
  }

  # Per-member average risk: subscriber + dependents + spouse.
  data <- data %>%
    mutate(mean_concurrent_risk = sum_concurrent_risk / (1 + ndeps + nspouse))

  print(paste0("High mean-risk threshold is: ", highmeanthresh))
  print(paste0("Low sum-risk threshold is: ", lowsumthresh))
  print(paste0("High sum-risk threshold is: ", highsumthresh))
  print(paste0("High max-risk threshold is: ", highmaxthresh))

  data <- data %>%
    mutate(orig_subs_id = subscriberid,
           subscriberid_year = paste0(subscriberid, "_", year),
           max_dev_risk = max_concurrent_risk - mean_concurrent_risk,
           # Indicator for mean risk at or above the high-mean threshold
           high_risk = if_else(mean_concurrent_risk >= highmeanthresh, 1, 0),
           # Indicator for max risk at or above the high-max threshold
           max_acg_high = if_else(max_concurrent_risk >= highmaxthresh, 1, 0),
           # Indicators for summed risk in the low / high tails
           low_sum_risk = if_else(sum_concurrent_risk <= lowsumthresh, 1, 0),
           high_sum_risk = if_else(sum_concurrent_risk >= highsumthresh, 1, 0),
           # Summed risk, zeroed out for low-sum-risk households
           sum_risk_cond_nolow = if_else(low_sum_risk == 1, 0, sum_concurrent_risk),
           # Family type
           withkids = if_else(withkids == "TRUE", 1, 0),
           nokids = 1 - withkids,
           married = if_else(married == "TRUE", 1, 0),
           single = 1 - married,
           married_withkids = if_else(withkids * married == 1, 1, 0),
           # Fixed: this used `single * married`, which is identically zero
           # because single = 1 - married.
           single_withkids = if_else(single * withkids == 1, 1, 0),
           # Partial enrollment indicator
           partial_enrollment = if_else(nummonths_span <= 10, 1, 0),
           # Premium and cost are in 100's of dollars.
           p = best_guess_netprem / 100,
           cost = totpaidmonth_span / 100,
           over40 = as.numeric(age >= 40),
           under40 = 1 - over40,
           over50 = as.numeric(age >= 50),
           under50 = 1 - over50,
           top_income = as.numeric(best_guess_incomeoverFPL >= medincome),
           low_income = as.numeric(best_guess_incomeoverFPL < medincome),
           subsidized = as.numeric(best_guess_Subsidy > 0),
           unsubsidized = as.numeric(best_guess_Subsidy == 0),
           # NOTE: `nocost` is computed before cost is zeroed for the
           # uninsured and for zero-ACG households further down.
           nocost = if_else(cost == 0, 1, 0),
           zeroacg = if_else(sum_concurrent_risk == 0, 1, 0),
           # Managed care indicator
           mnc = if_else(MNC == "MNC", 1, 0),
           not_mnc = 1 - mnc,
           # Actuarial Value
           x_av = best_guess_AV / 100,
           insurance = as.numeric(x_av > 0),
           uninsured = as.numeric(x_av == 0),
           # If no insurance, cost has to be 0
           cost = cost * insurance,
           # Eighth character of the plan-year identifier
           # (presumably the metal-tier code, with 3 = silver -- confirm).
           metal = substr(constructed_plan_year, 8, 8),
           portland = (best_guess_ra == 1),
           nominal_silver = (metal == 3),
           silver_sub = (nominal_silver & subsidized),
           # `<=` binds tighter than `&`; parenthesised to make that explicit.
           silver_costshare = (nominal_silver & (best_guess_incomeoverFPL <= 2.5)),
           # Cost after the insurance adjustment, before zero-ACG zeroing
           # and winsorizing.
           raw_cost = cost)
  print(nrow(data))

  # Set cost for households with zero ACG score equal to zero.
  data <- data %>%
    mutate(cost = case_when(
      (sum_concurrent_risk == 0) ~ 0,
      (sum_concurrent_risk != 0) ~ cost
    ))

  # Winsorize cost from above.
  print(paste0("Winsorizing cost at ", winsorbound))
  data <- data %>%
    mutate(cost = case_when(
      (cost < winsorbound) ~ cost,
      (cost >= winsorbound) ~ winsorbound
    ))

  # Bin summed risk using breaks c(0, acgquins); with include.lowest = TRUE a
  # value of exactly 0 falls in the first bin. Values above the largest break
  # become NA. One dummy column is then generated per bin.
  data <- data %>%
    mutate(sum_acg_quintiles = cut(
      sum_concurrent_risk, c(0, acgquins), include.lowest = TRUE, labels = FALSE))
  data <- fastDummies::dummy_cols(data, select_columns = "sum_acg_quintiles")

  print("Generate payer-portland-year dummies")
  data <- data %>% mutate(payer_ra_year = paste0(PAYER_ID, "_", year, "_", portland))
  data <- fastDummies::dummy_cols(data, select_columns = "payer_ra_year")

  print("Generate year-ID dummies")
  data <- fastDummies::dummy_cols(data, select_columns = "year") %>%
    select(-year_2014) # drop dummy for outside opt

  print("Generate payer-ID dummies")
  # Errors if no row has PAYER_ID == 0, i.e. if the outside option is absent.
  data <- fastDummies::dummy_cols(data, select_columns = "PAYER_ID") %>%
    select(-PAYER_ID_0) # drop dummy for outside opt

  # Create a constant and zero variable
  data$cons <- 1
  data$zero <- 0

  # Report size of data in the end
  print(paste0("Ending n observations: ", nrow(data)))
  print(paste0("Ending n subscriberids: ", length(unique(data$subscriberid))))
  return(data)
}
